Importing libraries
library(dplyr)
library(readxl)
library(tidygeocoder)
library(sf)
library(mapview)
library(RColorBrewer)
library(plotly)
Importing data
# Load the workbook, then keep only the city name, the five natural-resource
# scores and the geocoded coordinates.
data <- read_excel("geo_NCdata.xlsx")
nr_data <- data %>%
  select(c(
    "City",
    "Availability of Water",
    "Agricultural Potential",
    "Mining Potential",
    "Tourism Potential",
    "Environmental Sensitivity",
    "latitude",
    "longitude"
  ))
# Preview the first rows of the reduced table.
head(nr_data)
## # A tibble: 6 x 8
## City `Availability of~ `Agricultural Po~ `Mining Potenti~ `Tourism Potent~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Aggeney~ 19.2 26.4 70.3 16.2
## 2 Alexand~ 34.4 20.1 66.4 54.0
## 3 Askham,~ 19.0 32.7 34.3 54.4
## 4 Augrabi~ 34.3 45.6 30.7 54.2
## 5 Barkly ~ 42.0 56.1 76.5 41.2
## 6 Brandvl~ 17.0 23.1 28.1 17.7
## # ... with 3 more variables: Environmental Sensitivity <dbl>, latitude <dbl>,
## # longitude <dbl>
# Components that make up each natural-resource score:
# - Availability of Water: Rainfall, Dams, Perennial rivers, Ground water
#   potential, Boreholes
# - Agricultural Potential: Crop production / Irrigated land, Grazing Capacity,
#   Agro-processing facilities, Land capability, Aridity zones
# - Mining Potential: Active mines, Mineral deposits, Mining applications
# - Tourism Potential: Terrain index, Cultural and heritage sites
# - Environmental Sensitivity: Protected and Conservation Areas, Biodiversity
#   and Geohazards, NFEPA rivers and wetlands

# Violin plot of the "Availability of Water" score, with the embedded box plot
# and mean line switched on.
fig <- nr_data %>%
  plot_ly(
    y = ~`Availability of Water`,
    type = "violin",
    # Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
    box = list(visible = TRUE),
    meanline = list(visible = TRUE),
    x0 = "Availability of Water"
  )
# Add the chart title and label the y axis; the zero line is hidden.
fig <- fig %>%
  layout(
    title = "Distribution of Availability of Water",
    yaxis = list(title = "%", zeroline = FALSE)
  )
fig
Cities/Towns that are not geocoded
# Print the names of the cities/towns whose row has a missing value in any
# column.
nr_data$City[!complete.cases(nr_data)]
## [1] "Delpoortshoop, Northern Cape" "Olynvenhoutsdrif, Northern Cape"
## [3] "Phillipstown, Northern Cape" "Soverby, Northern Cape"
Removing Cities that are not geocoded
# Keep only the rows where both coordinates are present.
locations_nr <- nr_data %>%
  filter(!is.na(longitude), !is.na(latitude))
Clustering is a broad set of techniques for finding subgroups of observations within a data set. When we cluster observations, we want observations in the same group to be similar and observations in different groups to be dissimilar. Because there isn’t a response variable, this is an unsupervised method: it seeks to find relationships between the n observations without being trained on a response variable. Clustering allows us to identify which observations are alike and, potentially, to categorize them accordingly. K-means clustering is the simplest and most commonly used clustering method for splitting a dataset into a set of k groups. In this case, clustering will help find Cities/Towns with similar Natural Resources.
k-means clustering is a method of vector quantization, originally from signal processing, that aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster.
Clustering is the process of grouping data objects using a similarity measure.
Clustering can be hierarchical or partitional, exclusive, overlapping or fuzzy, and complete or partial.
K-Means is a partitional clustering technique; data objects are divided into non-overlapping groups.
K-Means is a prototype-based clustering technique.
A prototype-based cluster is represented by a prototype such that all members within a cluster are close to the corresponding prototype.
Centroid and medoid are two commonly used prototypes.
K-Means clustering learns properties of a set of data points and forms partitions called clusters, that represent data with similar properties. For continuous data, each cluster is represented by the centroid which is the mean of cluster members.
# Centre and scale the five natural-resource scores so that each one
# contributes on a comparable scale to the distance-based clustering.
locations_nr_scale <- locations_nr %>%
  select(c(
    "Availability of Water",
    "Agricultural Potential",
    "Mining Potential",
    "Tourism Potential",
    "Environmental Sensitivity"
  )) %>%
  scale()
#hopkins(locations_nr_scale, n = nrow(locations_nr_scale)-1)
library(factoextra)
# Diagnostics for choosing the number of clusters: the within-cluster sum of
# squares ("wss", elbow method) and the average silhouette width.
# kmeans() starts from randomly chosen centres, so the RNG is seeded before
# each call; otherwise the plots can differ from run to run.
set.seed(123)
fviz_nbclust(locations_nr_scale, kmeans, method = "wss")
set.seed(123)
fviz_nbclust(locations_nr_scale, kmeans, method = "silhouette")
# Seed the RNG so the random starting centres, and therefore the cluster
# labels, are the same on every run.
set.seed(123)
# Fit k-means with 7 clusters, using 25 random starts.
locations_nr_cluster <- locations_nr_scale %>%
  kmeans(centers = 7, nstart = 25)
library(ggplot2)
library(plotly)
# Draw the fitted clusters with factoextra, apply a minimal theme without a
# legend, set the title, and convert the ggplot to an interactive plotly widget.
(fviz_cluster(locations_nr_cluster, data = locations_nr_scale) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Natural Resource Clusters (Groups)")) %>%
  ggplotly()
Adding the clusters to the Natural Resource Data Frame
# Attach each city's k-means cluster label to the data as a factor column.
locations_nr <- locations_nr %>%
  mutate(Cluster = as.factor(locations_nr_cluster$cluster))
# Preview the table with the new column.
head(locations_nr)
## # A tibble: 6 x 9
## City `Availability of~ `Agricultural Po~ `Mining Potenti~ `Tourism Potent~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Aggeney~ 19.2 26.4 70.3 16.2
## 2 Alexand~ 34.4 20.1 66.4 54.0
## 3 Askham,~ 19.0 32.7 34.3 54.4
## 4 Augrabi~ 34.3 45.6 30.7 54.2
## 5 Barkly ~ 42.0 56.1 76.5 41.2
## 6 Brandvl~ 17.0 23.1 28.1 17.7
## # ... with 4 more variables: Environmental Sensitivity <dbl>, latitude <dbl>,
## # longitude <dbl>, Cluster <fct>
# Per-cluster profile: the mean of each natural-resource score within every
# k-means cluster.
nr_clust <- locations_nr %>%
  select(c(
    "Availability of Water",
    "Agricultural Potential",
    "Mining Potential",
    "Tourism Potential",
    "Environmental Sensitivity"
  ))
nr_clust_table <- aggregate(
  nr_clust,
  by = list(cluster = locations_nr_cluster$cluster),
  FUN = mean
)
nr_clust_table
## cluster Availability of Water Agricultural Potential Mining Potential
## 1 1 24.58120 28.87901 52.73583
## 2 2 39.71313 57.21372 68.73526
## 3 3 34.99344 38.45660 35.36125
## 4 4 24.66475 34.42678 27.20825
## 5 5 33.87909 42.32195 75.05238
## 6 6 35.43283 46.15791 38.63769
## 7 7 20.51299 26.72645 25.11277
## Tourism Potential Environmental Sensitivity
## 1 41.46333 31.25629
## 2 25.83522 37.47207
## 3 22.59556 42.73840
## 4 28.99601 63.86612
## 5 15.95009 58.56706
## 6 53.02939 37.06212
## 7 53.50389 42.81456
# locations_nr %>%
# group_by(Cluster) %>%
# summarise(n = n()) %>%
# arrange(n) %>%
# mutate(Cluster = factor(Cluster, levels = unique(Cluster))) %>%
# plot_ly(x = ~n, y = ~Cluster, type = "bar") %>%
# layout(title = "Natural Resource Grouping", yaxis = list(title = "Cluster"),
# xaxis = list(title = "Number of Cities/Towns"))
# Interactive horizontal bar chart of the number of cities/towns per cluster.
# Clusters are sorted by size and the factor levels are reset to that order so
# the bars are drawn from smallest to largest.
(locations_nr %>%
  count(Cluster, name = "No_of_Cities") %>%
  arrange(No_of_Cities) %>%
  mutate(Cluster = factor(Cluster, levels = unique(Cluster))) %>%
  ggplot(aes(x = Cluster, y = No_of_Cities)) +
  geom_bar(stat = "identity", fill = "#1f77b4") +
  geom_text(aes(label = No_of_Cities), vjust = -0.25) +
  coord_flip() +
  labs(
    x = "Cluster",
    y = "Number of Cities/Towns",
    title = "Natural Resource Grouping (Clusters)"
  ) +
  theme_minimal()) %>%
  ggplotly()
# Turn the clustered table into point features built from the
# longitude/latitude columns (CRS EPSG:4326), then map them coloured by cluster.
Natural_Resource <- locations_nr %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326)
mapview(Natural_Resource, zcol = "Cluster")